{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# add vocabulary txt, one line for each word\n",
    "VOCABULARY_TXT_FILEPATH = \"synonyms_en.txt\"\n",
    "OUTPUT_VDB_FILENAME = \"synonyms_en.index\"\n",
    "\n",
    "# Set workspace to Windrecorder dir\n",
    "import os\n",
    "import sys\n",
    "current_dir = os.getcwd()\n",
    "parent_parent_dir = os.path.dirname(os.path.dirname(current_dir))\n",
    "sys.path.append(parent_parent_dir)\n",
    "os.chdir(\"..\")\n",
    "os.chdir(\"..\")\n",
    "SUB_DIR = \"extension\\\\i18n_create_synonyms_embedding_asset\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from windrecorder import file_utils\n",
    "\n",
    "words_list = file_utils.read_txt_as_list(os.path.join(parent_parent_dir, SUB_DIR, VOCABULARY_TXT_FILEPATH))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from windrecorder import img_embed_manager\n",
    "\n",
    "model_cpu = img_embed_manager.get_model(mode=\"cpu\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# embedding all words into vector\n",
    "vdb = img_embed_manager.VectorDatabase(vdb_filename=os.path.join(parent_parent_dir, SUB_DIR, OUTPUT_VDB_FILENAME), db_dir=\"\")\n",
    "\n",
    "i = 0\n",
    "all = len(words_list)\n",
    "for word in words_list:\n",
    "    print(f\"{i}/{all}, {word=}\")\n",
    "    vector = img_embed_manager.embed_text(model=model_cpu, text_query=word, detach_numpy=False)\n",
    "    vdb.add_vector(vector=vector, rowid=i)\n",
    "    i += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['affection', 'affectionate', 'affinity', 'romantic', 'cherish']\n"
     ]
    }
   ],
   "source": [
    "# test querying result\n",
    "vector_search = img_embed_manager.embed_text(model=model_cpu, text_query=\"love\", detach_numpy=True)\n",
    "prob_res = vdb.search_vector(vector=vector_search, k=5)\n",
    "word_res = [words_list[i[0]] for i in prob_res]\n",
    "print(word_res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save vector database to file\n",
    "vdb.save_to_file()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# [optional] clean txt\n",
    "# Dictionaries can be preprocessed as needed. Here we only take the words before the first space of each line, and then remove all words with less than 2 words.\n",
    "\n",
    "def process_file(input_file, output_file):\n",
    "    with open(input_file, 'r', encoding='utf8') as f:\n",
    "        lines = f.readlines()\n",
    "\n",
    "    processed_lines = []\n",
    "\n",
    "    for line in lines:\n",
    "        if ' ' in line:\n",
    "            line = line.split(' ')[0]\n",
    "\n",
    "        line = line.strip()\n",
    "\n",
    "        if len(line) >= 2:\n",
    "            processed_lines.append(line + '\\n')\n",
    "\n",
    "    with open(output_file, 'w', encoding='utf8') as f:\n",
    "        f.writelines(processed_lines)\n",
    "\n",
    "\n",
    "process_file(os.path.join(parent_parent_dir, SUB_DIR, \"TOEFL.txt\"), os.path.join(parent_parent_dir, SUB_DIR, \"synonyms_en.txt\"))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
